home *** CD-ROM | disk | FTP | other *** search
/ EuroCD 3 / EuroCD 3.iso / Programming / vbcc / machines / amiga68k / libsrc / math / math_040 / tan.s < prev    next >
Encoding:
Text File  |  1998-06-24  |  14.9 KB  |  374 lines

  1. *
  2. *   $VER: tan.s 33.1 (22.1.97)
  3. *
  4. *   Calculates the tangent or the cotangent of the source operand
  5. *
  6. *   Version history:
  7. *
  8. *   33.1    22.1.97 (c) Motorola
  9. *
  10. *           - snipped from M68060SP
  11. *
  12. *   33.2    23.1.97 laukkanen
  13. *
  14. *           - added cot(), well that was the second solution I thought
  15. *             after 1/tan() which didn't seem very fast but I'm no
  16. *             "mathemagician" so if anybody wants to object, please do.
  17. *
  18. *   33.3    24.1.97 laukkanen
  19. *
  20. *           - the reduce part of tan() was broken, fixed.
  21. *
  22.  
  23.     machine 68040
  24.     fpu     1
  25.  
  26.     XDEF    _tan
  27.     XDEF    @tan
  28.     XDEF    _cot
  29.     XDEF    @cot
  30.  
  31.     XREF    PITBL
  32.  
  33. *************************************************************************
  34. * tan():   computes the tangent of a normalized input                   *
  35. * cot():   computes the cotangent of a normalized input                 *
  36. *                                                                       *
  37. * INPUT *************************************************************** *
  38. *       fp0 = pointer to extended precision input                       *
  39. *                                                                       *
  40. * OUTPUT ************************************************************** *
  41. *       fp0 = tan(X) or fp0 = cot(X)                                    *
  42. *                                                                       *
  43. * ACCURACY and MONOTONICITY ******************************************* *
  44. *       The returned result is within 3 ulp in 64 significant bit, i.e. *
  45. *       within 0.5001 ulp to 53 bits if the result is subsequently      *
  46. *       rounded to double precision. The result is provably monotonic   *
  47. *       in double precision.                                            *
  48. *                                                                       *
  49. * ALGORITHM *********************************************************** *
  50. *                                                                       *
  51. *       1. If |X| >= 15Pi or |X| < 2**(-40), go to 6.                   *
  52. *                                                                       *
  53. *       2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let        *
  54. *               k = N mod 2, so in particular, k = 0 or 1.              *
  55. *                                                                       *
  56. *       3. If k is odd, go to 5.                                        *
  57. *                                                                       *
  58. *       4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a  *
  59. *               rational function U/V where                             *
  60. *               U = r + r*s*(P1 + s*(P2 + s*P3)), and                   *
  61. *               V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))),  s = r*r.      *
  62. *               Exit.                                                   *
  63. *                                                                       *
  64. *       4. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by *
  65. *               a rational function U/V where                           *
  66. *               U = r + r*s*(P1 + s*(P2 + s*P3)), and                   *
  67. *               V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r,       *
  68. *               -Cot(r) = -V/U. Exit.                                   *
  69. *                                                                       *
  70. *       6. If |X| > 1, go to 8.                                         *
  71. *                                                                       *
  72. *       7. (|X|<2**(-40)) Tan(X) = X. Exit.                             *
  73. *                                                                       *
  74. *       8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back   *
  75. *               to 2.                                                   *
  76. *                                                                       *
  77. *************************************************************************
  78.  
  79. TANQ4   dc.l            $3EA0B759,$F50F8688
  80. TANP3   dc.l            $BEF2BAA5,$A8924F04
  81. TANQ3   dc.l            $BF346F59,$B39BA65F,$00000000,$00000000
  82. TANP2   dc.l            $3FF60000,$E073D3FC,$199C4A00,$00000000
  83. TANQ2   dc.l            $3FF90000,$D23CD684,$15D95FA1,$00000000
  84. TANP1   dc.l            $BFFC0000,$8895A6C5,$FB423BCA,$00000000
  85. TANQ1   dc.l            $BFFD0000,$EEF57E0D,$A84BC8CE,$00000000
  86. INVTWOPI dc.l           $3FFC0000,$A2F9836E,$4E44152A,$00000000
  87. TWOPI1  dc.l            $40010000,$C90FDAA2,$00000000,$00000000
  88. TWOPI2  dc.l            $3FDF0000,$85A308D4,$00000000,$00000000
  89. TWOBYPI dc.l            $3FE45F30,$6DC9C883
  90. PID2    dc.d            1.57079632679489661923
  91.  
  92. _tan
  93.         fmove.d         (4,sp),fp0
  94. @tan
  95.         fmove.x         fp0,-(sp)
  96.         move.l          (sp),d1
  97.         move.w          (4,sp),d1
  98.         and.l           #$7FFFFFFF,d1
  99.         lea             (12,sp),sp
  100.  
  101.         cmp.l           #$3FD78000,d1           ; |X| >= 2**(-40)?
  102.         bge.b           .TANOK1
  103.         bra.w           .TANSM
  104. .TANOK1
  105.         cmp.l           #$4004BC7E,d1           ; |X| < 15 PI?
  106.         blt.b           .TANMAIN
  107.         bra.w           .REDUCEX
  108.  
  109. .TANMAIN
  110. ;--THIS IS THE USUAL CASE, |X| <= 15 PI.
  111. ;--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP.
  112.         fmove.x         fp0,fp1
  113.         fmul.d          (TWOBYPI,pc),fp1        ; X*2/PI
  114.  
  115.         lea             (PITBL+$200,pc),a1      ; TABLE OF N*PI/2, N = -32,...,32
  116.  
  117.         fmove.l         fp1,d1                  ; CONVERT TO INTEGER
  118.  
  119.         asl.l           #4,d1
  120.         add.l           d1,a1                   ; ADDRESS N*PIBY2 IN Y1, Y2
  121.  
  122.         fsub.x          (a1)+,fp0               ; X-Y1
  123.  
  124.         fsub.s          (a1),fp0                ; FP0 IS R = (X-Y1)-Y2
  125.  
  126.         ror.l           #5,d1
  127.         and.l           #$80000000,d1           ; D0 WAS ODD IFF D0 < 0
  128.  
  129. .TANCONT
  130.         fmovem.x        fp2/fp3,-(sp)           ; save fp2,fp3
  131.  
  132.         tst.l           d1
  133.         blt.w           .NODD
  134.  
  135.         fmove.x         fp0,fp1
  136.         fmul.x          fp1,fp1                 ; S = R*R
  137.  
  138.         fmove.d         (TANQ4,pc),fp3
  139.         fmove.d         (TANP3,pc),fp2
  140.         fmul.x          fp1,fp3                 ; SQ4
  141.         fmul.x          fp1,fp2                 ; SP3
  142.  
  143.         fadd.d          (TANQ3,pc),fp3          ; Q3+SQ4
  144.         fadd.x          (TANP2,pc),fp2          ; P2+SP3
  145.  
  146.         fmul.x          fp1,fp3                 ; S(Q3+SQ4)
  147.         fmul.x          fp1,fp2                 ; S(P2+SP3)
  148.  
  149.         fadd.x          (TANQ2,pc),fp3          ; Q2+S(Q3+SQ4)
  150.         fadd.x          (TANP1,pc),fp2          ; P1+S(P2+SP3)
  151.  
  152.         fmul.x          fp1,fp3                 ; S(Q2+S(Q3+SQ4))
  153.         fmul.x          fp1,fp2                 ; S(P1+S(P2+SP3))
  154.  
  155.         fadd.x          (TANQ1,pc),fp3          ; Q1+S(Q2+S(Q3+SQ4))
  156.         fmul.x          fp0,fp2                 ; RS(P1+S(P2+SP3))
  157.  
  158.         fmul.x          fp3,fp1                 ; S(Q1+S(Q2+S(Q3+SQ4)))
  159.  
  160.         fadd.x          fp2,fp0                 ; R+RS(P1+S(P2+SP3))
  161.  
  162.         fadd.s          #$3F800000,fp1          ; 1+S(Q1+...)
  163.  
  164.         fmovem.x        (sp)+,fp2/fp3           ; restore fp2,fp3
  165.  
  166.         fdiv.x          fp1,fp0                 ; last inst - possible exception set
  167.         rts
  168.  
  169. .NODD
  170.         fmove.x         fp0,fp1
  171.         fmul.x          fp0,fp0                 ; S = R*R
  172.  
  173.         fmove.d         (TANQ4,pc),fp3
  174.         fmove.d         (TANP3,pc),fp2
  175.  
  176.         fmul.x          fp0,fp3                 ; SQ4
  177.         fmul.x          fp0,fp2                 ; SP3
  178.  
  179.         fadd.d          (TANQ3,pc),fp3          ; Q3+SQ4
  180.         fadd.x          (TANP2,pc),fp2          ; P2+SP3
  181.  
  182.         fmul.x          fp0,fp3                 ; S(Q3+SQ4)
  183.         fmul.x          fp0,fp2                 ; S(P2+SP3)
  184.  
  185.         fadd.x          (TANQ2,pc),fp3          ; Q2+S(Q3+SQ4)
  186.         fadd.x          (TANP1,pc),fp2          ; P1+S(P2+SP3)
  187.         fmul.x          fp0,fp3                 ; S(Q2+S(Q3+SQ4))
  188.         fmul.x          fp0,fp2                 ; S(P1+S(P2+SP3))
  189.  
  190.         fadd.x          (TANQ1,pc),fp3          ; Q1+S(Q2+S(Q3+SQ4))
  191.         fmul.x          fp1,fp2                 ; RS(P1+S(P2+SP3))
  192.  
  193.         fmul.x          fp3,fp0                 ; S(Q1+S(Q2+S(Q3+SQ4)))
  194.  
  195.         fadd.x          fp2,fp1                 ; R+RS(P1+S(P2+SP3))
  196.         fadd.s          #$3F800000,fp0          ; 1+S(Q1+...)
  197.  
  198.         fmovem.x        (sp)+,fp2/fp3           ; restore fp2,fp3
  199.  
  200.         fmove.x         fp1,-(sp)
  201.         eor.l           #$80000000,(sp)
  202.  
  203.         fdiv.x          (sp)+,fp0               ; last inst - possible exception set
  204.         rts
  205.  
  206. .TANSM
  207.         rts
  208.  
  209. FP_SCR0         EQU     -12
  210. FP_SCR0_EX      EQU     -12
  211. FP_SCR0_HI      EQU     -8
  212. FP_SCR0_LO      EQU     -4
  213. FP_SCR1         EQU     -24
  214. FP_SCR1_EX      EQU     -24
  215. FP_SCR1_HI      EQU     -20
  216. FP_SCR1_LO      EQU     -16
  217. INARG       EQU         -12
  218. ENDFLAG     EQU         -28
  219. TWOTO63     EQU         -32
  220. INT         EQU         -28
  221.  
  222. REDUCE_SIZE EQU         32
  223.  
  224. ;--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW.
  225. ;--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING
  226. ;--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE.
  227. .REDUCEX
  228.         fmovem.x        fp2-fp5,-(sp)           ; save {fp2-fp5}
  229.         move.l          d2,-(sp)                ; save d2
  230.         link            a0,#-REDUCE_SIZE
  231.         fmove.s         #$00000000,fp1          ; fp1 = 0
  232.  
  233. ;--If compact form of abs(arg) in d0=$7ffeffff, argument is so large that
  234. ;--there is a danger of unwanted overflow in first LOOP iteration.  In this
  235. ;--case, reduce argument by one remainder step to make subsequent reduction
  236. ;--safe.
  237.         cmp.l           #$7ffeffff,d1           ; is arg dangerously large?
  238.         bne.b           .LOOP                   ; no
  239.  
  240. ; yes; create 2**16383*PI/2
  241.         move.w          #$7ffe,(FP_SCR0_EX,a0)
  242.         move.l          #$c90fdaa2,(FP_SCR0_HI,a0)
  243.         clr.l           (FP_SCR0_LO,a0)
  244.  
  245. ; create low half of 2**16383*PI/2 at FP_SCR1
  246.         move.w          #$7fdc,(FP_SCR1_EX,a0)
  247.         move.l          #$85a308d3,(FP_SCR1_HI,a0)
  248.         clr.l           (FP_SCR1_LO,a0)
  249.  
  250.         fcmp.s          #0,fp0                  ; test sign of argument
  251.         fblt.w          .red_neg
  252.  
  253.         or.b            #$80,(FP_SCR0_EX,a0)    ; positive arg
  254.         or.b            #$80,(FP_SCR1_EX,a0)
  255. .red_neg
  256.         fadd.x          (FP_SCR0,a0),fp0        ; high part of reduction is exact
  257.         fmove.x         fp0,fp1                 ; save high result in fp1
  258.         fadd.x          (FP_SCR1,a0),fp0        ; low part of reduction
  259.         fsub.x          fp0,fp1                 ; determine low component of result
  260.         fadd.x          (FP_SCR1,a0),fp1        ; fp0/fp1 are reduced argument.
  261.  
  262. ;--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4.
  263. ;--integer quotient will be stored in N
  264. ;--Intermeditate remainder is 66-bit dc.l; (R,r) in (FP0,FP1)
  265. .LOOP
  266.         fmove.x         fp0,(INARG,a0)          ; +-2**K * F, 1 <= F < 2
  267.         move.w          (INARG,a0),d1
  268.         move.l          d1,a1                   ; save a copy of D0
  269.         and.l           #$00007FFF,d1
  270.         sub.l           #$00003FFF,d1           ; d0 = K
  271.         cmp.l           #28,d1
  272.         ble.b           .LASTLOOP
  273. .CONTLOOP
  274.         sub.l           #27,d1                  ; d0 = L := K-27
  275.         clr.b           (ENDFLAG,a0)
  276.         bra.b           .WORK
  277. .LASTLOOP
  278.         clr.l           d1                      ; d0 = L := 0
  279.         move.b          #1,(ENDFLAG,a0)
  280.  
  281. .WORK
  282. ;--FIND THE REMAINDER OF (R,r) W.R.T.   2**L * (PI/2). L IS SO CHOSEN
  283. ;--THAT INT( X * (2/PI) / 2**(L) ) < 2**29.
  284.  
  285. ;--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63),
  286. ;--2**L * (PIby2_1), 2**L * (PIby2_2)
  287.  
  288.         move.l          #$00003FFE,d2           ; BIASED EXP OF 2/PI
  289.         sub.l           d1,d2                   ; BIASED EXP OF 2**(-L)*(2/PI)
  290.  
  291.         move.l          #$A2F9836E,(FP_SCR0_HI,a0)
  292.         move.l          #$4E44152A,(FP_SCR0_LO,a0)
  293.         move.w          d2,(FP_SCR0_EX,a0)      ; FP_SCR0 = 2**(-L)*(2/PI)
  294.  
  295.         fmove.x         fp0,fp2
  296.         fmul.x          (FP_SCR0,a0),fp2        ; fp2 = X * 2**(-L)*(2/PI)
  297.  
  298. ;--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN
  299. ;--FLOATING POINT FORMAT, THE TWO FMOVE'S       FMOVE.L FP <--> N
  300. ;--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT
  301. ;--(SIGN(INARG)*2**63   +       FP2) - SIGN(INARG)*2**63 WILL GIVE
  302. ;--US THE DESIRED VALUE IN FLOATING POINT.
  303.         move.l          a1,d2
  304.         swap            d2
  305.         and.l           #$80000000,d2
  306.         or.l            #$5F000000,d2           ; d2 = SIGN(INARG)*2**63 IN SGL
  307.         move.l          d2,(TWOTO63,a0)
  308.         fadd.s          (TWOTO63,a0),fp2        ; THE FRACTIONAL PART OF FP1 IS ROUNDED
  309.         fsub.s          (TWOTO63,a0),fp2        ; fp2 = N
  310. ;       fintrz.x        fp2,fp2
  311.  
  312. ;--CREATING 2**(L)*Piby2_1 and 2**(L)*Piby2_2
  313.         move.l          d1,d2                   ; d2 = L
  314.  
  315.         add.l           #$00003FFF,d2           ; BIASED EXP OF 2**L * (PI/2)
  316.         move.w          d2,(FP_SCR0_EX,a0)
  317.         move.l          #$C90FDAA2,(FP_SCR0_HI,a0)
  318.         clr.l           (FP_SCR0_LO,a0)         ; FP_SCR0 = 2**(L) * Piby2_1
  319.  
  320.         add.l           #$00003FDD,d1
  321.         move.w          d1,(FP_SCR1_EX,a0)
  322.         move.l          #$85A308D3,(FP_SCR1_HI,a0)
  323.         clr.l           (FP_SCR1_LO,a0)         ; FP_SCR1 = 2**(L) * Piby2_2
  324.  
  325.         move.b          (ENDFLAG,a0),d1
  326.  
  327. ;--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and
  328. ;--P2 = 2**(L) * Piby2_2
  329.         fmove.x         fp2,fp4                 ; fp4 = N
  330.         fmul.x          (FP_SCR0,a0),fp4        ; fp4 = W = N*P1
  331.         fmove.x         fp2,fp5                 ; fp5 = N
  332.         fmul.x          (FP_SCR1,a0),fp5        ; fp5 = w = N*P2
  333.         fmove.x         fp4,fp3                 ; fp3 = W = N*P1
  334.  
  335. ;--we want P+p = W+w  but  |p| <= half ulp of P
  336. ;--Then, we need to compute  A := R-P   and  a := r-p
  337.         fadd.x          fp5,fp3                 ; fp3 = P
  338.         fsub.x          fp3,fp4                 ; fp4 = W-P
  339.  
  340.         fsub.x          fp3,fp0                 ; fp0 = A := R - P
  341.         fadd.x          fp5,fp4                 ; fp4 = p = (W-P)+w
  342.  
  343.         fmove.x         fp0,fp3                 ; fp3 = A
  344.         fsub.x          fp4,fp1                 ; fp1 = a := r - p
  345.  
  346. ;--Now we need to normalize (A,a) to  "new (R,r)" where R+r = A+a but
  347. ;--|r| <= half ulp of R.
  348.         fadd.x          fp1,fp0                 ; fp0 = R := A+a
  349. ;--No need to calculate r if this is the last loop
  350.         tst.b           d1
  351.         bgt.w           .RESTORE
  352.  
  353. ;--Need to calculate r
  354.         fsub.x          fp0,fp3                 ; fp3 = A-R
  355.         fadd.x          fp3,fp1                 ; fp1 = r := (A-R)+a
  356.         bra.w           .LOOP
  357.  
  358. .RESTORE
  359.         fmove.l         fp2,(INT,a0)
  360.         move.l          (INT,a0),d1
  361.         unlk            a0
  362.         move.l          (sp)+,d2                ; restore d2
  363.         fmovem.x        (sp)+,fp2-fp5           ; restore {fp2-fp5}
  364.         ror.l           #1,d1
  365.         bra.w           .TANCONT
  366.  
  367. _cot
  368.         fmove.d         (4,sp),fp0
  369. @cot
  370.         fadd.d          (PID2,pc),fp0
  371.         bsr.w           @tan
  372.         fneg.x          fp0
  373.         rts
  374.